from pyspark.sql import SparkSession
from pyspark.sql import Column
from pyspark.sql.functions import *

if __name__ == '__main__':
    # Build the Spark session (local mode).
    spark = SparkSession \
        .builder \
        .master("local") \
        .appName("WordCount") \
        .getOrCreate()

    # Read the input as a single-column dataset: each row holds one raw
    # line of text in a STRING column named "line". The "|" separator is
    # chosen so commas inside a line are NOT treated as CSV delimiters.
    linesDF = spark \
        .read \
        .format("csv") \
        .option("sep", "|") \
        .schema("line STRING") \
        .load("../data/words.txt")

    # Split each line on "," and explode so every word becomes its own row.
    # Alias the exploded column explicitly: relying on Spark's default
    # output name ("col") is fragile and obscures downstream references.
    wordsDF = linesDF.select(explode(split(linesDF.line, ",")).alias("word"))

    # Group by word, count occurrences of each, and print the result.
    wordsDF.groupBy("word").agg(count("word")).show()

    # Release the session's resources (was missing in the original).
    spark.stop()
